package com.hargasembako.core.util;


import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;


/**
 * Static helpers for preparing text for full-text search: strips HTML,
 * applies English Snowball stemming and stop-word removal, and returns
 * the resulting distinct tokens.
 */
public class SearchJanitorUtils {
	private static final Logger log = Logger.getLogger(SearchJanitorUtils.class.getName());

    /** From StopAnalyzer Lucene 2.9.1 */
    public final static String[] stopWords = new String[]{
                "a", "an", "and", "are", "as", "at", "be", "but", "by",
                "for", "if", "in", "into", "is", "it",
                "no", "not", "of", "on", "or", "such",
                "that", "the", "their", "then", "there", "these",
                "they", "this", "to", "was", "will", "with"
              };

    // Utility class; prevent instantiation.
    private SearchJanitorUtils() {
    }

    /**
     * Extracts distinct search tokens from raw (possibly HTML) text using
     * English stemming (Snowball via Lucene) plus stop-word filtering.
     *
     * @param index_raw raw input text; HTML tags are stripped first.
     *        A {@code null} input yields an empty set.
     * @param maximumNumberOfTokensToReturn cap on the number of distinct
     *        tokens collected; tokenization stops once the set reaches
     *        this size.
     * @return the (possibly empty) set of stemmed, stop-word-filtered
     *         tokens; never {@code null}. On an I/O error from the
     *         analyzer, the tokens gathered so far are returned.
     */
    public static Set<String> getTokensForIndexingOrQuery( String index_raw, int maximumNumberOfTokensToReturn) {

    	Set<String> returnSet = new HashSet<String>();
    	if (index_raw == null) {
    		// Be defensive: callers passing null get an empty result
    		// instead of a NullPointerException.
    		return returnSet;
    	}

    	String indexCleanedOfHTMLTags = index_raw.replaceAll("\\<.*?>"," ");

    	try {
    		Analyzer analyzer =  new SnowballAnalyzer("English",stopWords);
    		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(indexCleanedOfHTMLTags));
    		try {
    			// Legacy Lucene 2.x token iteration: next() returns null
    			// when the stream is exhausted.
    			Token token;
    			while (((token = tokenStream.next()) != null)
    					&& (returnSet.size() < maximumNumberOfTokensToReturn)) {

    				returnSet.add(token.term());

    			}
    		} finally {
    			// Release analyzer resources even if iteration fails.
    			tokenStream.close();
    		}
    	} catch (IOException e) {
    		// Log with the throwable so the stack trace is preserved
    		// (e.getMessage() alone may be null and loses context).
    		log.log(Level.SEVERE, "Failed to tokenize input for indexing/query", e);
    	}

    	return returnSet;
    }
}
